# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# read in and explore the dataset
loans = pd.read_csv('prosperLoanData.csv')
loans.head()
loans.shape
# since there are a lot of columns, let's see the columns 10 at a time in order to better understand the data
loans.loc[:,loans.columns[0:10]].sample(5)
loans.loc[:,loans.columns[10:20]].sample(5)
loans.loc[:,loans.columns[20:30]].sample(5)
loans.loc[:,loans.columns[30:40]].sample(5)
loans.loc[:,loans.columns[40:50]].sample(5)
loans.loc[:,loans.columns[50:60]].sample(5)
loans.loc[:,loans.columns[60:70]].sample(5)
loans.loc[:,loans.columns[70:81]].sample(5)
# get info about the different columns
loans.info()
# What date range are we looking at?
loans.loc[:,'LoanOriginationDate'] = pd.to_datetime(loans['LoanOriginationDate'])
print(loans.LoanOriginationDate.min())
print(loans.LoanOriginationDate.max())
# Trying to understand Interest Rates and Yield.
# Are these rates always in the order BorrowerAPR > BorrowerRate > LenderYield
print(loans.query('BorrowerAPR < BorrowerRate').shape)
print(loans.query('BorrowerRate < LenderYield').shape)
# Understand the 2 rating systems
print(loans.loc[:,'CreditGrade'].unique())
print(loans.loc[:,'ProsperRating (Alpha)'].unique())
# Is there much difference between listing creation and loan origination?
# loans.loc[:,['ListingCreationDate', 'LoanOriginationDate']].sample(10)
listing_to_loan_dates = pd.to_datetime(loans['LoanOriginationDate']) - pd.to_datetime(loans['ListingCreationDate'])
print(listing_to_loan_dates.max(), '\n', listing_to_loan_dates.min(), '\n', listing_to_loan_dates.mean())
# What were the occupations specified
loans.loc[:,'Occupation'].unique()
# What were the employment statuses specified?
loans.loc[:,'EmploymentStatus'].value_counts()
# How does the data look for Listing Categories
loans.loc[:,'ListingCategory (numeric)'].value_counts()
# What is the spread of interest rates?
loans[['BorrowerAPR', 'BorrowerRate']].describe()
# does varifiable income matter as a factor?
loans['IncomeVerifiable'].value_counts()
The Loans dataset contains 113937 records with 81 data points about each loan. Some of the salient points that can be observed about the dataset are as below
In fact a good number of fields do not have data in all the rows as they are either dependent on the time (pre 2009 vs post), or on whether the borrower had a prior credits or relationship with prosper.
We would like to focus on the APR of the Borrower and what are the factors that affect the APR
Some of the data points that are related to the APR will of course be the credit scores and credit ratings of the borrower. The rate might also be dependant on the loan tenure and loan amounts that have been opted for. The rate could also depend on the employment status and income of the borrower. The final thing that would be interesting would be to see if the interest rates change with calendar.
loans_apr = loans.copy()
# let's now only take those fields that we are interested in. We want to explore factors that affect the Borrower APR
columns = ['CreditGrade', 'Term', 'BorrowerAPR', 'ProsperRating (Alpha)', 'EmploymentStatus', 'EmploymentStatusDuration', 'IsBorrowerHomeowner', 'CreditScoreRangeLower', 'CreditScoreRangeUpper', 'CurrentCreditLines', 'DebtToIncomeRatio', 'StatedMonthlyIncome', 'LoanOriginalAmount', 'LoanOriginationQuarter']
loans_apr = loans_apr.filter(items=columns)
loans_apr.shape
loans_apr.describe()
loans_apr.info()
# Remove missing data in BorrowerAPR
loans_apr = loans_apr.dropna(subset=['BorrowerAPR'])
# Drop CreditGrade value NC
loans_apr = loans_apr.query('CreditGrade != "NC"')
# Let's merge the credit rating data for pre 2009 and post 2009.
loans_apr.loc[:,'CreditRating'] = loans_apr['ProsperRating (Alpha)'].combine_first(loans_apr['CreditGrade'])
loans_apr = loans_apr.dropna(subset=['CreditRating'])
loans_apr['CreditRating'].isna().sum()
# Drop the superfluous columns now
loans_apr = loans_apr.drop(columns=['ProsperRating (Alpha)', 'CreditGrade'])
# Now convert the CreditRatings into an ordinal type
credit_ratings = ['HR', 'E', 'D', 'C', 'B', 'A','AA']
credit_type = pd.api.types.CategoricalDtype(credit_ratings, ordered=True)
loans_apr.loc[:,'CreditRating'] = loans_apr['CreditRating'].astype(credit_type)
loans_apr['CreditRating'].unique()
# Remove entries with 0 CreditScoreRange
loans_apr = loans_apr.query('CreditScoreRangeLower > 0')
# Convert credit score to average
loans_apr.loc[:,'CreditScoreRangeAverage'] = (loans_apr['CreditScoreRangeLower'] + loans_apr['CreditScoreRangeUpper']) / 2
loans_apr.loc[:,'CreditScoreRangeAverage'] = loans_apr['CreditScoreRangeAverage'].apply(np.floor)
loans_apr.head()
# Drop the extra columns
loans_apr = loans_apr.drop(columns=['CreditScoreRangeLower', 'CreditScoreRangeUpper'])
# and convert to int type
loans_apr.loc[:, 'CreditScoreRangeAverage'] = loans_apr['CreditScoreRangeAverage'].astype(int)
# Split the loan origination quarter columns into quarters and years
loans_apr[['LoanOriginationQuarter', 'LoanOriginationYear']] = loans_apr['LoanOriginationQuarter'].str.split(expand=True)
loans_apr.loc[:,'LoanOriginationYear'] = loans_apr['LoanOriginationYear'].astype(int)
loans_apr.head()
# Make the Quarters as ordinal type
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
quarter_type = pd.api.types.CategoricalDtype(quarters, ordered=True)
loans_apr.loc[:,'LoanOriginationQuarter'] = loans_apr['LoanOriginationQuarter'].astype(quarter_type)
loans_apr.loc[:,'LoanOriginationQuarter'].unique()
# Above checks means that all NaN status are in the status Not available and so we'll set it accordingly.
loans_apr = loans_apr.fillna(value={'EmploymentStatus':'Not available'})
loans_apr.loc[:,'EmploymentStatus'].value_counts()
# We'll come up with an order for risk profile based on employment status.
# Graphs later on will corroborate whether we made the right assumptions
emp_statuses = ['Not available', 'Not employed', 'Retired', 'Part-time', 'Other', 'Self-employed', 'Employed', 'Full-time']
emp_status_type = pd.api.types.CategoricalDtype(emp_statuses, ordered=True)
loans_apr.loc[:,'EmploymentStatus'] = loans_apr['EmploymentStatus'].astype(emp_status_type)
loans_apr.EmploymentStatus.unique()
# Would like to round the monthly income to integers as well
loans_apr.loc[:,'StatedMonthlyIncome'] = loans_apr['StatedMonthlyIncome'].astype(int)
loans_apr.head()
# Checking the fields
loans_apr.info()
# finally describe this subset of data
loans_apr.describe()
loans_apr.to_csv('prosperLoanData_filtered.csv', index=False)
plt.hist(loans_apr['Term'])
plt.show();
Observations: There are only 3 term periods that all loans fall into - 12, 36, and 60 months with ~80% of listings opting for a 36 month loan period. Any further investigation on this variable will probably be skewed.
bin_edges = np.arange(0,0.55,0.01)
plt.hist(loans_apr['BorrowerAPR'], bins=bin_edges)
plt.show();
print(loans_apr['BorrowerAPR'].mode())
print()
print(loans_apr['BorrowerAPR'].value_counts().head())
print()
print(loans_apr.query('BorrowerAPR > 0.35 & BorrowerAPR < 0.36').shape[0])
Observations: The distribution of APR is mostly normal except for a specific APR which has a peak at around .36. This indicates that there is a good range of interest rates available and it will be interesting to note which other variables correlate to the BorrowerAPR.
The 3 highest occurances of APR are at 0.35797, 0.35643, 0.37453. Let us take the max range to be 0.35 to 0.36 for future analysis
print(np.log10((loans_apr['EmploymentStatusDuration'] + 1).max()))
print(np.log10((loans_apr['EmploymentStatusDuration'] + 1).min()))
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
bin_edges = np.arange(0, loans_apr['EmploymentStatusDuration'].max()+12, 12)
plt.hist(loans_apr['EmploymentStatusDuration'], bins = bin_edges)
plt.subplot(1,2,2)
bin_edges_log = 10 ** np.arange(0, np.log10((loans_apr['EmploymentStatusDuration']+1).max()) + 0.1, 0.1)
# plt.hist(loans_apr.loc[loans_apr['EmploymentStatusDuration'] > 0,'EmploymentStatusDuration'] + 1, bins = bin_edges_log)
plt.hist(loans_apr.loc[:,'EmploymentStatusDuration'] + 1, bins = bin_edges_log)
plt.xscale('log')
xticks = [1,3,10,30,50,100,300,1000]
xlabels = np.asarray(xticks) - 1
plt.xticks(xticks, xlabels)
plt.show();
Observations: The intitial distributions shows a highly right skewed plot. This is due to the extremely high incidence of no or very low employment durations among the population. Upon taking a log scale on the x-axis we see a largely normal distribution of employment durations with a peak between 60 to 120 months.
bin_edges = np.arange(0,55,1)
plt.hist(loans_apr.loc[:, 'CurrentCreditLines'], bins = bin_edges)
plt.show();
Observations: We see a slightly right skewed distribution peaking at 10.
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
bin_edges = np.arange(0, loans_apr.DebtToIncomeRatio.max()+.05,.05)
plt.hist(loans_apr['DebtToIncomeRatio'], bins=bin_edges)
plt.xlim(0,1.5)
plt.subplot(1,2,2)
bin_edges_log = 10 ** np.arange(0, np.log10((loans_apr['DebtToIncomeRatio']+1).max())+.01,.01)
# plt.hist(loans_apr.loc[loans_apr['DebtToIncomeRatio'] > 0, 'DebtToIncomeRatio']+1, bins=bin_edges_log)
plt.hist(loans_apr.loc[:,'DebtToIncomeRatio']+1, bins=bin_edges_log)
plt.xscale('log')
xticks = [1,1.2,1.5,1.65, 2,3]
xlabels = [0,0.2,0.5, 0.65, 2, 3]
plt.xticks(xticks, xlabels)
plt.xlim(1,1.8)
plt.show();
(loans_apr.DebtToIncomeRatio < .65).sum() / loans_apr.shape[0]
Observations: Again a very right skewed distribution curve. Most of the values seems to be concentrated around 0.2. About 95% of the population has a DebtToIncomeRatio of <= 0.5 and 98% < 0.65. Hence this distribution seems representative of the population.
bin_edges = np.arange(0,loans_apr.StatedMonthlyIncome.max()+500, 500)
plt.hist(loans_apr.loc[:, 'StatedMonthlyIncome'], bins=bin_edges)
plt.xlim(0,16500)
plt.show();
(loans_apr.StatedMonthlyIncome < 16500).sum() / loans_apr.shape[0]
Observations: Extremely right skewed distribution of monthly incomes. It is normal in the range of 0-15000 and then tapers off sharply thereafter. However 97% of the data points reside within this range, so the remaining could be treated as outliers and the graph plotted is representative of the population.
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
bin_edges = np.arange(0,36000,1000)
plt.hist(loans_apr['LoanOriginalAmount'], bins=bin_edges)
plt.subplot(1,2,2)
bin_edges = 10 ** np.arange(3,np.log10(35000)+0.05,0.05)
plt.hist(loans_apr['LoanOriginalAmount'], bins=bin_edges)
plt.xscale('log')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xticks(xticks, xlabels)
plt.show();
Observations: The original plot without any transformation is right skewed, however there are interesting patterns of peaks at predictable intervals. Upon taking the log transformation, the peaks are very easily identifiable on an otherwise normal distribution. The peaks are at round figures for loan amounts of 2k, 3k, 4k, 10k, 15k.
bin_edges = np.arange(0,loans_apr.CreditScoreRangeAverage.max()+10,10)
plt.hist(loans_apr['CreditScoreRangeAverage'], bins=bin_edges)
plt.xlim(400,900)
plt.show();
Observations:: Mostly normal distribution around the range where maximum data is present, maybe slightly left skewed. Most credit scores are distributed between 650 and 750. This shows that individuals with good credit scores are probably more on the prosper loan platform.
base_color = sb.color_palette()[0]
sb.countplot(data=loans_apr, x='CreditRating', color=base_color)
plt.show();
Observations: Interestingly this distribution is fairly normal. A large number of loans have been given out where the rating was between D and B. I would have expected this to be left skewed and very few loans to have been given to those with lower credit ratings.
sb.countplot(data=loans_apr, x='LoanOriginationYear', color=base_color)
plt.show();
Observations: Data in 2005 and 2014 is not available for the whole year. Other than that, there seems to be a huge lull in 2009 (maybe as a result of the lending crisis) and post that the number of loans offered is quadratically increasing. There doesn't seem to be much significance excapt maybe to see whether lending rate are increasing / decreasing over time?
sb.countplot(data=loans_apr, x='LoanOriginationQuarter', color=base_color)
plt.show();
Observations: Nothing significant here. Loans given out in every quarter is mostly identical. I would have expected more loans to have been given / taken during year ending / calendar endings.
sb.countplot(data=loans_apr, x='IsBorrowerHomeowner', color=base_color)
plt.show();
Observations: The distribution is evenly mathched between home owners and non home owners. This is very surprising. I expected a majority of loan takers to also be home owners. It will be interesting to see how the APR correlates to this variable.
plt.figure(figsize=(20,5))
sb.countplot(data=loans_apr, x='EmploymentStatus', color=base_color)
plt.show();
Observations: Most applicants are Employed or Full-time employed. I would expect interest rates to be lower for employed individuals.
The BorrowerAPR was generally normally distributed. A clear distribution was observed without having to do any sort of transformations. What was unusual in this distribution was that there was a rather large peak at about 0.36 that was standing out within this otherwise normal distribution.
# continuous numeric variables
var = 'BorrowerAPR'
cont_feats = ['EmploymentStatusDuration', 'DebtToIncomeRatio', 'LoanOriginalAmount', 'CurrentCreditLines', 'StatedMonthlyIncome', 'CreditScoreRangeAverage']
# discrete numeric vars
disc_feats = ['Term', 'LoanOriginationYear']
# Categoric vars
cat_feats = ['EmploymentStatus','IsBorrowerHomeowner', 'LoanOriginationQuarter', 'CreditRating']
loans_apr_nz = loans_apr.copy()
# Select desired ranges of the data especially in skewed columns
# loans_apr_nz = loans_apr_nz.query('CurrentCreditLines < 30')
loans_apr_nz = loans_apr_nz.query('DebtToIncomeRatio < 0.65')
loans_apr_nz = loans_apr_nz.query('StatedMonthlyIncome > 0 and StatedMonthlyIncome < 16500')
loans_apr_nz = loans_apr_nz.query('CreditScoreRangeAverage > 400 and CreditScoreRangeAverage < 900')
# Adjusting these two fields for log transformations by moving it equaly away from 0
loans_apr_nz.loc[:,'EmploymentStatusDuration'] = loans_apr_nz['EmploymentStatusDuration'] + 1
loans_apr_nz.loc[:,'DebtToIncomeRatio'] = loans_apr_nz['DebtToIncomeRatio'] + 1
loans_apr_nz.loc[:,'CurrentCreditLines'] = loans_apr_nz['CurrentCreditLines'] + 1
# verifying that I've not filtered out all of this data
loans_apr_nz.query('EmploymentStatus == "Not available"').shape[0]
# Take a random sample since there are a lot of variables.
loans_apr_sample = loans_apr_nz.sample(10000)
plt.figure(figsize=(15,15))
i = 1
for feat in cont_feats:
plt.subplot(2,3,i)
# xseries = np.log10(loans_apr_sample[feat] + 1)
xseries = loans_apr_sample[feat]
plt.scatter(y=loans_apr_sample[var], x=xseries, alpha=0.1)
plt.ylabel('Borrower APR')
plt.xlabel(feat)
if feat in ['EmploymentStatusDuration', 'DebtToIncomeRatio', 'LoanOriginalAmount']:
plt.xscale('log')
i = i+1
plt.show();
Employment Status Duration: We don't see any significant direct correlation between this data and the APR. The only observation is that a lot of loans are taken by the individuals who have been working for a longer time. Zero values have no significance.Debt to Income ratio: Again there isn't a significant correlation that we can observe. The incidence of loans taken just gets lesser with increase in this ratio. Zero values have no significance.Loan original amount: Except for the high density of loans taken in the intervals that we had determined earlier, there is no other correlation that can be observed here. Although it can be seen that the range of APRs (specifically max APR) are reducing with increase in loan amount, however even lower amounts have instances of very low APR. This indicates that there are probably other factors having a direct influence. Zero values have no significance.Current Credit lines: No correlation can be noticed here as well. The only observation is that the number of loans taken by those with lower current credit lines far exceed the number of loans taken by those with already many credit lines - which sort of makes sense as these people have already a high credit burden and don't want to or can't take up more. Zero values have no significance.Monthly Income: There seems to be some sort of negative correlation here, but on observing more keenly we can see that just hte data gets more sparse with higher monthly incomes and it is not that the APR has any significant change.Average Credit Score Rating: Now this is where we see a very clear correlation as the APR seems to be decreasing with increase in Average Credit Scores. This is a feature that we could explore in more detail as we go along.# using the same sample set from the previous charts for consistency in comparing
plt.figure(figsize=(15,10))
i = 1
for feat in disc_feats:
plt.subplot(2,1,i)
sb.violinplot(data=loans_apr_nz, x=feat, y='BorrowerAPR', color=base_color, inner='quartile')
i = i + 1
plt.show();
plt.figure(figsize=(15,5))
sb.barplot(data=loans_apr_nz, x='LoanOriginationYear', y='BorrowerAPR', color=base_color, ci='sd');
Term: What can be seen from the chart is that lower terms tend to have a higher APR, and the higher terms seems to have a lower APR, just judging by the modes on the violinplots. It can also be seen that the term of 36 months has many modes and a wider spread of APR. From the univariate plot of the Term, we remember that 80% of the loans are taken with 36 months as the term and hence there is disproportionately high amount of data points in this value and hence no inference can be made from this chart.Loan Origination Year: An interesting trend that we can see from here is that average APR is seen to be going up through the years, peaking in 2011 and then has started going back down over the next years. Another observation is that in the last 2 years, the distribution of APR is more concentrated near the median were in the earlier years the spread is larger and we see modes closer to the Upper and Lower Quartiles. While this is interesting, this seems more of a general market tread and not something that we can use to determine the APR.# using the same sample set from the previous charts for consistency in comparing
plt.figure(figsize=(15,20))
i = 1
for feat in cat_feats:
plt.subplot(4,1,i)
sb.boxplot(data=loans_apr_nz, x=feat, y='BorrowerAPR', color=base_color)
i = i + 1
plt.show();
Employment Status: This doesn't show any clear correlation. Those who are full-time employed seem to have a lower average APR, but then the ones who are retired seem to have an even lower interest rate. So this chart is clearly not conclusive.Is Homeowner: On an average, the APR for homeowners seems to be a few points lesser than the APR for non-homeowners. This is something that could be investigated further.Loan Origination Quarter: Loans in Q1 seem to carry a lower APR than loans in Q3 which has the highest averages. Credit Rating: This is where we can see a clear correlation. As the credit rating increases, the APR reduces substantially and even the spread of APR reduces. CreditRating NA has a very wide spread which tends to indicate that other factors are then considered for the APR. There are a lot of outliers in this plot and would be interesting to find some of these other factors that go into deciding the APR.We will ignore the discrete numeric variables from now on as it seems to not have much impact on the variable of intesrest.
# Let's start with a heatmap
plt.figure(figsize=(10,5))
sb.heatmap(loans_apr_nz[cont_feats].corr(), annot=True, fmt='.3f', cmap='vlag_r', center=0)
plt.show();
# getting a sample of the data
loans_apr_sample = loans_apr_nz.sample(1000)
# Now we'll go for a pairgrid scatter plot
g = sb.PairGrid(data=loans_apr_sample, vars=cont_feats)
g.map(plt.scatter, alpha=0.4)
for ax in g.axes.flat:
if ax.get_xlabel() in ['EmploymentStatusDuration', 'DebtToIncomeRatio', 'LoanOriginalAmount']:
ax.set_xscale('log')
if ax.get_ylabel() in ['EmploymentStatusDuration', 'DebtToIncomeRatio', 'LoanOriginalAmount']:
ax.set_yscale('log')
plt.show();
Current Credit Lines vs Debt to Income Ratio: This is a very obvious correlation in that the Debt to Income ratio is fairly strongly correlated with the number of Credit Lines that the individual has.Monthly Income vs Loan Amount: Again an obvious correlation is emerging here. As the Monthly Income increases, the individual is looking for larger loan amounts. Monthly Income vs Current Credit Lines: Similarly with increasing income, there seems to be a trend to have more Credit Lines open.Credit Score vs Loan Amount: We can see that individuals with higher credit scores are also seeking larger loans.Employment Duration vs Monthly Income: Fairly obvious also is that the Monthly Income is higher for individuals who have worked longer.# plot against isHomeOwner can be separate as it is a small chart
plt.figure(figsize=(20,10))
num_feats = len(cont_feats)
for i, feat in zip(range(1,num_feats+1), cont_feats):
plt.subplot(2, (num_feats+1)/2, i)
sb.boxplot(data=loans_apr_nz, x='IsBorrowerHomeowner', y=feat, color=base_color)
plt.show();
HomeOwners: Those who have longer Employment Status Durations have a higher chance of owning a house. Debt to Income ratio doesn't seem to matter much, however the homeowners have higher income. Homeowners seem to be able to borrow a higher loan amount, but being a homeowner also increases the liklihood of opening up more credit lines. Finally credit ratings are obviously higher for home owners.
The Interesting observation from this and the previous charts is that homeowners seem to have a slight advantage at getting lower APRs, however they are also the ones that are taking up more loans and larger loans - yet they get lower interests. This has to be explored.
# plotting other categories now. Plotting the correlations individually instead of using a pairgrid
# As there are many categories to be displayed and hence for better control
plt.figure(figsize=(20,50))
i = 1
for cfeat in cat_feats:
if cfeat == 'IsBorrowerHomeowner':
continue
for feat in cont_feats:
plt.subplot(9, 2, i)
sb.boxplot(data=loans_apr_nz, x=cfeat, y=feat, color=base_color)
i = i + 1
plt.show();
In general this data is either all over the place (such as that for Employment Status) or is not conclusive.
Credit Rating is the one that seem to have a strong correlation with the other data points.
# Homeowner is best plotted separately as it has only 2 categories
plt.figure(figsize=(10,15))
i = 1
for feat in cat_feats:
if feat == 'IsBorrowerHomeowner':
continue
plt.subplot(3,1,i)
sb.countplot(data=loans_apr_nz, x=feat, hue='IsBorrowerHomeowner', palette='Blues')
plt.legend(loc='upper right', title='homeowner?')
i = i + 1
plt.show();
# plotting the remaining ones now individually
plt.figure(figsize=(15,15))
plt.subplot(4,1,1)
sb.countplot(data=loans_apr_nz, x='CreditRating', hue='LoanOriginationQuarter', palette='Blues')
plt.legend(loc='upper left', title='Quarter')
plt.subplot(4,1,2)
sb.countplot(data=loans_apr_nz, x='CreditRating', hue='EmploymentStatus', palette='Blues')
plt.legend(loc='upper left', title='EmploymentStatus')
plt.subplot(4,1,3)
sb.countplot(data=loans_apr_nz, x='EmploymentStatus', hue='LoanOriginationQuarter', palette='Blues')
plt.legend(loc='upper left', title='Quarter')
plt.subplot(4,1,4)
sb.countplot(data=loans_apr_nz, x='EmploymentStatus', hue='CreditRating', palette='Blues')
plt.legend(loc='upper left', title='CreditRating')
plt.show();
There are some very good observations to be seen here.
CreditRating is high for Employed and full-time employed individuals whereas the others have relatively low ratings.Loan origination quarter doesn't seem to show any significant patterns and is in general all over the place.HomeOwner - this is very interesting in some aspects. The non-homeowners tend to have a high distribution among the lower credit rating categories, whereas for the higher rating categories the number of homeowners are more. This is expected in hindsight, but is very interesting to see in visuals.Employment Status - Those will not available have a reducing correlation as the credit rating increases.# Average loan sizes by Credit Rating
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sb.barplot(data=loans_apr_nz, x='CreditRating', y='LoanOriginalAmount', color=base_color, ci=95);
plt.subplot(1,2,2)
sb.barplot(data=loans_apr_nz, x='CreditRating', y='LoanOriginalAmount', estimator=np.sum, color=base_color);
# Average loan sized by Homeowner Status
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
sb.barplot(data=loans_apr_nz, x='IsBorrowerHomeowner', y='LoanOriginalAmount', color=base_color, ci=95);
plt.subplot(1,2,2)
sb.barplot(data=loans_apr_nz, x='IsBorrowerHomeowner', y='LoanOriginalAmount', color=base_color, estimator=np.sum);
plt.show()
# Average loan sizes by Employment Status
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
sb.barplot(data=loans_apr_nz, x='EmploymentStatus', y='LoanOriginalAmount', color=base_color, ci=95);
plt.xticks(rotation=15)
plt.subplot(1,2,2)
sb.barplot(data=loans_apr_nz, x='EmploymentStatus', y='LoanOriginalAmount', color=base_color, estimator=np.sum);
plt.xticks(rotation=15)
plt.show();
# Average loan sizes by Year
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
sb.barplot(data=loans_apr_nz, x='LoanOriginationYear', y='LoanOriginalAmount', color=base_color, ci=95);
plt.subplot(1,2,2)
sb.barplot(data=loans_apr_nz, x='LoanOriginationYear', y='LoanOriginalAmount', color=base_color, estimator=np.sum);
plt.show();
Observations: Some obvious trends here; loan aomunts are definitely higher for those with better credit ratings. The employed individuals too seem to be able to afford larger loans, in general. Also home owners have taken a larger loan by the looks of things.
sb.barplot(data=loans_apr_nz, x='CreditRating', y='StatedMonthlyIncome', color=base_color, ci=95);
Observations: Average monthly income is definitely proportional to the Credit Rating as well. Looks like those who earn more are able to maintain their credit ratings due to good repayment abilities.
plt.figure(figsize=(10,15))
i = 1
for feat in cat_feats:
plt.subplot(len(cat_feats), 1, i)
sb.barplot(data=loans_apr_nz, x=feat, y='Term', color=base_color, ci='sd');
i = i + 1
plt.show();
Observation Spread of terms is evenly distributed across the different categories.
There were some very obvious trends that BorrowerAPR reflected. As expected the BorrowerAPR was inversely correlated to increasing Credit Scores or Credit Ratings. BorrowerAPR also seemed to be correlated to the Loan Original Amount and as the Loan amounts got larger, the APR showed a reducing trend. APR vs Income was showing some sort of a trend, so it might be worthwhile plotting it in relation to other variables.
Borrowers who who have higher incomes, seem to have a higher appetite for credit and these individuals show an increasing trend in the number of Credit Lines they have as well as they tend to take larger Loan Amounts as loans.
Some very obvious relations we see are high incomes relate to taking bigger loans - this would be due to the capacity to pay off bigger loans. We see high credit ratngs related to higher loans - this might be in part to higher loans being offered to more credit worthy individuals. Part time employees have the lowest of all incomes, lowest amounts of loans taken, as well as lowest number of credit lines which makes a lot of sense, however they are on an average credit worthy. Tne relation of homeowners with credit ratings is very interesting as well seeing that higher credit ratings is more strongly correlated with being a homeowner.
loans_apr_sample = loans_apr_nz.sample(10000)
plt.figure(figsize=(15,10))
sb.scatterplot(data=loans_apr_sample, x='LoanOriginalAmount', y='BorrowerAPR', hue="CreditRating", palette="cubehelix_r")
plt.xscale('log')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xticks(xticks, xticks)
plt.show();
loans_apr_sample = loans_apr_nz.sample(10000)
plt.figure(figsize=(15,10))
sb.scatterplot(data=loans_apr_sample, x='LoanOriginalAmount', y='BorrowerAPR', hue="CreditRating", style='IsBorrowerHomeowner', palette="cubehelix_r")
plt.xscale('log')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xticks(xticks, xticks)
plt.show();
Observations: We can see that the upper bound of the APR shows a decreasing patterns with an increase in the Loan Amount. What can be clearly observed by overlaying the Credit Ratings on this plot, is that even irrespective of the size of the loan taken, credit worthy individuals tend to have a lower APR for the most part.
It is also worth nothing that there doesn't seem to be much impact of being a homeowner as far as the features of loan amount, credit rating are concerned.
loans_apr_sample = loans_apr_nz.query('LoanOriginalAmount >= 4000 and LoanOriginalAmount <= 5000')
loans_apr_sample = loans_apr_sample.sample(10000)
plt.figure(figsize=(15,10))
sb.scatterplot(data=loans_apr_sample, x='StatedMonthlyIncome', y='BorrowerAPR', hue="CreditRating",
palette="cubehelix_r", alpha=0.5)
# plt.xscale('log')
# xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
# xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
# plt.xticks(xticks, xticks)
plt.legend(loc="lower right")
plt.show();
Observation: It is difficult to see a consistent relation between the income and the APR even restricting the Loan Amount to a specific band.
loans_apr_sample = loans_apr_nz.sample(10000)
g = sb.FacetGrid(data=loans_apr_sample, col='CreditRating', col_wrap=3, height=4, aspect=1)
g.map(sb.scatterplot, 'LoanOriginalAmount', 'BorrowerAPR', color=base_color, alpha=0.4)
g.set(xscale='log')
xticks = [1000,3000,10000,30000]
xlabels=['1k','3k','10k','30k']
g.set(xticks=xticks, xticklabels=xlabels)
plt.show();
Observations: Drilling down using a FacetGrid we see that the APR spread does drop down and concentrate towards the lower sections of the APR values as the Credit Rating increases.
In Credit Ratings A and AA we see more of larger loans being taken. Whereas in Credit Ratings D and C we can actually see an even spread of interest rates and loan amounts being taken. Might be worth to dig deeper into what are the additional parameters here that decide on the APR.
loans_apr_sample = loans_apr_nz.query('CreditRating == "D" or CreditRating == "C"')
loans_apr_sample = loans_apr_sample.sample(10000)
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_sample, x='LoanOriginalAmount', y='BorrowerAPR', hue='IsBorrowerHomeowner', palette='Paired')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xscale('log')
xticks = [1000,3000,10000,30000]
xlabels=['1k','3k','10k','30k']
plt.xticks(xticks, xlabels)
plt.ylim(0.10, 0.40)
plt.show();
Observation: Being a homeowner does't seem to have a significant relationship at all. The plots are all over the place.
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_sample, x='LoanOriginalAmount', y='BorrowerAPR', sizes=(2,200), alpha=0.6,
hue='StatedMonthlyIncome', size='StatedMonthlyIncome', palette='Blues')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xscale('log')
xticks = [1000,3000,10000,30000]
xlabels=['1k','3k','10k','30k']
plt.xticks(xticks, xlabels)
plt.ylim(0.10, 0.40)
plt.show();
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_sample.query('EmploymentStatusDuration < 400'), x='LoanOriginalAmount', y='BorrowerAPR', sizes=(2,200), alpha=0.6,
hue='EmploymentStatusDuration', size='EmploymentStatusDuration', palette='Blues')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xscale('log')
xticks = [1000,3000,10000,30000]
xlabels=['1k','3k','10k','30k']
plt.xticks(xticks, xlabels)
plt.ylim(0.05, 0.40)
plt.show();
Observation: Generally it looks like lower incomes are taking up lower loan amounts and high incomes prefer higher amounts. However there is no clear relationship to the APR. What else might be causing the APR to vary so much within the same Credit Rating?
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_sample, x='LoanOriginalAmount', y='BorrowerAPR', alpha=0.6,
hue='LoanOriginationYear', palette='Blues')
xticks = [1000,1500,2000,3000,4000,5000,10000,15000,20000,25000,30000]
xlabels=['1k','1.5','2k','3k','4k','5k','10k','15k','20k','','30k']
plt.xscale('log')
xticks = [1000,3000,10000,30000]
xlabels=['1k','3k','10k','30k']
plt.xticks(xticks, xlabels)
plt.ylim(0.10, 0.40)
plt.show();
Observation: Loan origination year still seems to be very relevant in this chart. For certain years, there is a clear band in which the APR falls
We can see that there is a strong correlation between credit rating as well as loan amount. So by selecting data keeping these two constant let's try to see if we can draw out any more information. We'll select the loan range as 4000-5000 where we see a peak in the distribution chart.
loans_apr_range = loans_apr_nz.query('CreditRating == "D" or CreditRating == "C"')
loans_apr_range = loans_apr_range.query('LoanOriginalAmount >= 4000 & LoanOriginalAmount <= 5000')
loans_apr_range.shape[0]
# checking if we have a range that we can work with
bin_edges = np.arange(0,0.55,0.01)
plt.hist(loans_apr_range['BorrowerAPR'], bins=bin_edges)
plt.xlim(0.1, 0.35)
plt.show();
Observation: This is a range that we can work with. Interestingly we see that the APR gradually climbs, however it drops off sharply after 0.30 to about 0.32
g = sb.FacetGrid(data=loans_apr_range, col='EmploymentStatus', col_wrap=2, height=4, aspect=2)
g.map(sb.scatterplot, 'StatedMonthlyIncome', 'BorrowerAPR')
g.set(ylim=(0.1,0.35))
plt.show();
Observation: Employment status still doesn't seem to have a strong correlation even after all the filters we have applied. We could use the Full-time employees as a filter to determine other correlations.
g = sb.FacetGrid(data=loans_apr_range, col='LoanOriginationYear', col_wrap=2, height=4, aspect=2)
g.map(sb.scatterplot, 'StatedMonthlyIncome', 'BorrowerAPR')
g.set(ylim=(0.1,0.35))
plt.show();
loans_apr_range_year = loans_apr_range.query('LoanOriginationYear > 2006 & LoanOriginationYear < 2014')
g = sb.FacetGrid(data=loans_apr_range_year, col='EmploymentStatus', hue='LoanOriginationYear',
col_wrap=2, height=4, aspect=2, palette='viridis')
g.map(sb.scatterplot, 'StatedMonthlyIncome', 'BorrowerAPR')
g.set(ylim=(0.1,0.35))
g.add_legend()
plt.show();
Observations: Certain very interesting patterns emerge here.
loans_apr_2013 = loans_apr_nz.query('LoanOriginationYear == 2013')
# feature eng - income per credit line
loans_apr_2013.loc[:,'IncomePerCreditLine'] = loans_apr_2013.loc[:,'StatedMonthlyIncome'] / loans_apr_2013.loc[:,'CurrentCreditLines']
loans_apr_2013.head()
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_2013, x='IncomePerCreditLine', y='BorrowerAPR', hue='CreditRating', palette='cubehelix_r')
plt.xlim(0,3000)
plt.show();
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_2013, x='StatedMonthlyIncome', y='BorrowerAPR', hue='CreditRating', palette='cubehelix_r')
plt.show();
plt.figure(figsize=(15,8))
sb.scatterplot(data=loans_apr_2013, x='LoanOriginalAmount', y='BorrowerAPR', hue='CreditRating', palette='cubehelix_r')
plt.show();
g = sb.FacetGrid(data=loans_apr_nz, col='LoanOriginationYear', hue='CreditRating', palette='cubehelix_r',
col_wrap=2, height=4, aspect=2)
g.map(sb.scatterplot, 'LoanOriginalAmount', 'BorrowerAPR')
g.add_legend()
plt.show();
Observations: Credit rating very clearly dominates the relationship to BorrowerAPR. It is made more clear by scoping it down to the year where you can see clear separations of bands of APRs. It is also interesting to see the relations to the max loan amount offered. Lower credit ratings probably cannot afford larger loans, or aren't sanctioned more loans. Whereas high credit ratings, probably don't need larger loans. Hence the meat of the distribution is in the band D, C, B. It can also be seen how the separation between bands is clearly coming out over the years, probably as the calculations and predictions of CreditRatings are becoming more data driven and the algorithms are being more optimized.
plt.figure(figsize=(15,10))
loans_apr_sample = loans_apr_nz.query('CurrentCreditLines <= 30')
loans_apr_sample = loans_apr_sample.sample(10000)
sb.scatterplot(data=loans_apr_sample, y='StatedMonthlyIncome', x='LoanOriginalAmount',
hue='CurrentCreditLines', size='CurrentCreditLines', sizes=(2,200), palette='Blues')
plt.xscale('log')
plt.show();
# Let's filter the data to IQRs of each feature.
q1_income = loans_apr_nz.StatedMonthlyIncome.quantile(.25)
q3_income = loans_apr_nz.StatedMonthlyIncome.quantile(.75)
q1_loan = loans_apr_nz.LoanOriginalAmount.quantile(.25)
q3_loan = loans_apr_nz.LoanOriginalAmount.quantile(.75)
q1_lines = loans_apr_nz.CurrentCreditLines.quantile(.25)
q3_lines = loans_apr_nz.CurrentCreditLines.quantile(.75)
loans_apr_quant = loans_apr_nz.copy()
loans_apr_quant = loans_apr_quant.query('StatedMonthlyIncome >= @q1_income and StatedMonthlyIncome <= @q3_income')
loans_apr_quant = loans_apr_quant.query('LoanOriginalAmount >= @q1_loan and LoanOriginalAmount <= @q3_loan')
loans_apr_quant = loans_apr_quant.query('CurrentCreditLines >= @q1_lines and CurrentCreditLines <= @q3_lines')
plt.figure(figsize=(15,10))
sb.scatterplot(data=loans_apr_quant.sample(10000), y='StatedMonthlyIncome', x='LoanOriginalAmount', hue='CurrentCreditLines',
size='CurrentCreditLines', sizes=(2,200), palette='Blues')
plt.xscale('log')
plt.show();
Observation: Relationship doesn't come out very obviously. By plotting within the boundaries of the IQR, the relations are all over the place and nothing concrete can be established. This still shows how important the Credit Rating is.
Using the entire dataset, it does look like there is a correlation between Loan Amount and Monthly Income. Also within a single value for Loan Amount the number of Credit Lines does seem to increase with Monthly Income.
plt.figure(figsize=(20,10))
sb.barplot(data=loans_apr_nz, y='BorrowerAPR', x='LoanOriginationYear',
hue='CreditRating', palette='Blues')
plt.show();
plt.figure(figsize=(20,10))
sb.pointplot(data=loans_apr_nz, y='BorrowerAPR', x='LoanOriginationYear', ci=90,
hue='CreditRating', palette='Blues', dodge=-0.3)
plt.show();
plt.figure(figsize=(20,8))
sb.boxplot(data=loans_apr_nz, x='CreditRating', y='BorrowerAPR', hue='LoanOriginationYear', palette='cubehelix_r')
plt.show();